---
title: "Combining Data, Binary Assignment, Counting Faces"
author: "Zachary Steinert-Threlkeld"
date: "03.14.2018"
# Table-of-contents options must be nested under each output format; placed
# directly under `output:` they are read as (non-existent) format names.
output:
  pdf_document:
    toc: yes
    toc_depth: 3
  html_notebook:
    toc: yes
    toc_depth: 3
  html_document:
    toc: yes
    toc_depth: 3
---
<!-- 
This script takes the raw data and adds demographic metadata as well as some binary labels.  
-->

```{r setup, include=TRUE}
# Optional knitr settings, left disabled by the author:
# knitr::opts_chunk$set(echo = TRUE)
# knitr::opts_knit$set(root.dir = '<path/to/Replication/>')
# library(knitr)

# Tidy the echoed source code and wrap it at 80 characters.
knitr::opts_chunk$set(
  tidy = TRUE,
  tidy.opts = list(width.cutoff = 80)
)
```
The purpose of this script is to process the raw classifier output.  It first merges the raw data into a data frame.  It then takes the raw data and assigns a binary indicator of whether or not the image is actually of the category, based on manual inspection of the distribution provided by Donghyeon Won.  These files are then saved and aggregated in a separate Python file (it is easier with pandas in Python).

# Count Faces
This section analyzes the face data and creates a dataset from it.  I want to exclude the tweets from the Women's March.

```{r, error=TRUE, cache=FALSE, eval=TRUE, echo=TRUE}
library(entropy)
library(dplyr)

# Print numbers in full rather than in scientific notation.
options(scipen = 999)

# setwd('<path/to/Replication/>')

# List every raw-data file (with its path) and keep only the merged file(s),
# identified by 'DonghyeonAlex' appearing in the path.
raw_paths <- list.files('./Data/01_rawData/', full.names = TRUE)
files <- grep('DonghyeonAlex', raw_paths, value = TRUE)

```


Aggregate face predictions by tweet.  Filter by s50_c1_filter for everything but crowd size.  
```{r, error=TRUE, cache=TRUE, eval=TRUE, echo=TRUE}
# Load the per-face classifier output.
faces <- read.csv(
  'Data/01_rawData/shortSpain_prediction_face_screened_s50_c1_7453.csv',
  stringsAsFactors = FALSE
)

# Number of faces per photo.  face_num is presumably a zero-based index within
# each tweet, hence the + 1 -- TODO confirm against the face detector output.
faces <- data.frame(
  faces %>%
    group_by(tweet_id) %>%
    mutate(totalFaces = max(face_num) + 1)
)

# Keep only faces that pass the s50_c1 filter; per the original author's note,
# the model was trained on faces of 50 by 50 pixels and this filter keeps those.
# read.csv() converts a column made up solely of True/False into a logical
# vector, and TRUE == 'True' is FALSE in R, so the flag is normalised to
# upper-case text before comparing.  %in% also drops rows whose flag is NA
# instead of turning them into all-NA rows.
passes_filter <- toupper(as.character(faces$s50_c1_filter)) %in% 'TRUE'
faces_classify <- faces[passes_filter, ]

# Count the labels that contain the age bracket `bracket` as a whole token,
# i.e. not embedded in a longer run of digits.  A plain substring match is not
# enough: the text '0-2' also occurs inside '20-29'.
count_age_bracket <- function(age_labels, bracket) {
  pattern <- paste0('(^|[^0-9])', bracket, '([^0-9]|$)')
  sum(grepl(pattern, age_labels))
}

# One row per tweet with the number of (filtered) faces in each race, gender
# and age category.
faces_classify2 <- data.frame(
  faces_classify %>%
    group_by(tweet_id) %>%
    summarize(
      raceWhite = sum(grepl('White', race_str)),
      raceIndian = sum(grepl('Indian', race_str)),
      raceLatino = sum(grepl('Latino', race_str)),
      raceMiddleEast = sum(grepl('Middle East', race_str)),
      raceBlack = sum(grepl('Black', race_str)),
      raceEastAsian = sum(grepl('East Asian', race_str)),
      raceSEAsian = sum(grepl('SE Asian', race_str)),
      # Case-sensitive, so 'Male' does not match inside 'Female'.
      facesMale = sum(grepl('Male', gender_str)),
      facesFemale = sum(grepl('Female', gender_str)),
      faces0_2 = count_age_bracket(age_str, '0-2'),
      faces3_9 = count_age_bracket(age_str, '3-9'),
      faces10_19 = count_age_bracket(age_str, '10-19'),
      faces20_29 = count_age_bracket(age_str, '20-29'),
      faces30_39 = count_age_bracket(age_str, '30-39'),
      faces40_49 = count_age_bracket(age_str, '40-49'),
      faces50_59 = count_age_bracket(age_str, '50-59'),
      faces60_69 = count_age_bracket(age_str, '60-69'),
      # NOTE(review): '+' is a regex quantifier here, so this matches any label
      # containing '70' (which includes the literal '70+').  Kept unchanged.
      faces70plus = sum(grepl('70+', age_str))
    )
)
```



Combine by country.
```{r, error=TRUE, cache=TRUE, eval=TRUE, echo=TRUE}
# One row per tweet, so that merging on the tweet id cannot duplicate rows.
# (The original script also assigned `faces2$id` at this point, but no object
# called faces2 is ever created in this file, so that statement only raised an
# error; the merges below key on tweet_id directly and do not need it.)
facesUnique <- faces[!duplicated(faces$tweet_id), ]

# Process each merged raw file and collect the results; they are bound together
# once after the loop instead of growing a data frame on every iteration.
pieces <- vector('list', length(files))

for (i in seq_along(files)) {  # seq_along() is safe when no file was found
  print(paste('On ', files[i]))
  temp <- read.csv(files[i], stringsAsFactors = FALSE)

  # A bug in an earlier script made a bad column name.  This fixes it; that bug
  # is now fixed, so future iterations won't need this.  Kept just to be safe.
  if ('rc.cc' %in% names(temp)) {
    temp$rg.cc <- temp$rc.cc
    temp <- temp[, !names(temp) %in% c('rc.cc'), drop = FALSE]
  }

  # Drop the row-name column left behind by an earlier write.csv().
  if ('X' %in% names(temp)) {
    temp <- temp[, !names(temp) %in% c('X'), drop = FALSE]
  }

  names(temp)[names(temp) == 'text'] <- 'text_x'
  names(temp)[names(temp) == 'text_new'] <- 'text_y'

  # Get total faces in tweet.  Left join: tweets without faces get NA.
  temp <- merge(
    temp, facesUnique[, c('tweet_id', 'totalFaces')],
    by.x = 'id', by.y = 'tweet_id', all.x = TRUE, all.y = FALSE
  )

  # Get other face information by tweet.
  temp <- merge(
    temp, faces_classify2,
    by.x = 'id', by.y = 'tweet_id', all.x = TRUE, all.y = FALSE
  )

  # Drop columns whose name starts with "face."; vestigial from Donghyeon.
  temp <- temp[, !startsWith(names(temp), 'face.'), drop = FALSE]

  # Add entropy measure of race.  This is by photo, will later add by day.
  temp$entropyRace <- apply(
    cbind(
      temp$raceWhite, temp$raceIndian, temp$raceLatino, temp$raceMiddleEast,
      temp$raceBlack, temp$raceEastAsian, temp$raceSEAsian
    ),
    MARGIN = 1, function(x) entropy(x)
  )

  # Add entropy measure of gender.  This is by photo, will later add by day.
  temp$entropyGender <- apply(
    cbind(temp$facesMale, temp$facesFemale),
    MARGIN = 1, function(x) entropy(x)
  )

  # Entropy of age.
  temp$entropyAge <- apply(
    cbind(
      temp$faces0_2, temp$faces3_9, temp$faces10_19, temp$faces20_29,
      temp$faces30_39, temp$faces40_49, temp$faces50_59, temp$faces60_69,
      temp$faces70plus
    ),
    MARGIN = 1, function(x) entropy(x)
  )

  pieces[[i]] <- temp
}

# Combine.  The original prepended each file's rows (rbind(temp, data)), so the
# pieces are bound in reverse order to keep the same row order.  With no input
# files this yields NULL, as before.
data <- do.call(rbind, rev(pieces))
```


Now, save the data.  Note different out name.

```{r, error=TRUE, cache=TRUE, eval=TRUE, echo=TRUE}
# Persist the merged tweet-level data (row names are written as well).
write.csv(
  x = data,
  file = './Data/02_processedData/a_DonghyeonAlexmerged_NewclassifiersWithFaces.csv'
)
```



# Dichotomize Ratings
This section will take the raw data and create dichotomous variables for if an image contains what it is classified as.  

```{r, error=TRUE, cache=TRUE, eval=TRUE, echo=TRUE}
# Reload the merged data written by the previous step.
data <- read.csv(
  './Data/02_processedData/a_DonghyeonAlexmerged_NewclassifiersWithFaces.csv',
  stringsAsFactors = FALSE
)

# Cut-off for each classifier score, one column per score.
thresholds <- data.frame(
  protest_result.children = .15,
  protest_result.fire = .37,
  protest_result.flag = .187,
  protest_result.group_100 = .509,
  protest_result.group_20 = .725,
  protest_result.night = .359,
  protest_result.photo = .815,
  protest_result.police = .937,
  protest_result.protest = .826,
  protest_result.shouting = .355,
  protest_result.sign = .744,
  protest_result.violence = .67,
  protest_result.protester_violence = .021,
  protest_result.state_violence = .01
)

# For every score column, add binary_<label>: 1 when the score reaches its
# threshold, 0 otherwise.  Columns are created in the order of `thresholds`.
for (score_col in names(thresholds)) {
  binary_col <- sub('protest_result.', 'binary_', score_col, fixed = TRUE)
  data[[binary_col]] <- ifelse(
    data[[score_col]] >= thresholds[[score_col]], 1, 0
  )
}

# Composite indicators built from the binary columns above.
data$binary_groupAny <- with(
  data, ifelse((binary_group_100 + binary_group_20) > 0, 1, 0)
)
data$binary_groupChildren <- with(
  data, ifelse(binary_groupAny * binary_children == 1, 1, 0)
)
data$binary_ViolencePolice <- with(
  data, ifelse(binary_police * binary_violence == 1, 1, 0)
)
data$binary_ViolenceFire <- with(
  data, ifelse(binary_violence * binary_fire == 1, 1, 0)
)
data$binary_ViolencePoliceFire <- with(
  data, ifelse(binary_violence * binary_fire * binary_police == 1, 1, 0)
)
```

Before saving, add some basic extra data that will be needed.
```{r}
# Constant 1 on every row.
data[['tweets']] <- 1

# Copy of the place country code under a shorter name.
data[['country']] <- data$place.country_code

write.csv(
  x = data,
  file = './Data/02_processedData/b_DonghyeonAlexmerged_NewclassifiersWithBinary.csv'
)
```

# Narrow Down Country Locations
Now, drop the tweets from Spain that come from anywhere other than Catalonia.
```{r}
# Distribution of the Spanish tweets across states, for inspection.
table(data$state[which(data$cc == 'ES')])

# %in% never returns NA, unlike == and !=.  With the plain comparisons, a row
# whose cc (or state) is NA was turned into an all-NA row in both subsets and
# therefore appeared twice, emptied, in the combined result.  Here rows with a
# missing cc are kept untouched as non-Spanish rows, and Spanish rows with a
# missing state are dropped along with the rest of non-Catalan Spain.
in_spain <- data$cc %in% 'ES'
in_catalonia <- in_spain & data$state %in% 'Catalonia'

data_short <- data[in_catalonia, ]

# Same row order as before: every non-Spanish row first, then Catalonia.
data <- data[!in_spain, ]
data <- rbind(data, data_short)

write.csv(data, './Data/02_processedData/c_DonghyeonAlexmerged_NewclassifiersShortSpain.csv')
```



# Colombia?
NB: The below does not appear to matter as of 03.14.2019.

Exploring the tweets from Colombia, which may be from Venezuela.

```{r}
# Pull out the rows coded as Colombia and tabulate where they come from.
is_colombia <- data$cc == 'CO'
colombia <- data[is_colombia, ]

table(colombia$state)
table(colombia$city)
```

That is a border city.  So, it looks like these are misclassified and should be Venezuela.

```{r}
# Recode Colombia as Venezuela in both country-code columns.
for (code_col in c('cc', 'country')) {
  coded_colombia <- which(data[[code_col]] == 'CO')
  data[[code_col]][coded_colombia] <- 'VE'
}

table(data$cc)  # Check that no 'CO' rows remain.
```

# Save data
Save the data out.  These data will be used for aggregating.  Make place.bounding_box.coordinates factor so Python does not read as string.

```{r}
# Store the bounding-box coordinates as a factor before writing.
# NOTE(review): write.csv() writes factor columns out as text, so confirm that
# this conversion has the intended effect on the Python side.
data[['place.bounding_box.coordinates']] <- as.factor(
  data$place.bounding_box.coordinates
)

write.csv(
  x = data,
  file = './Data/02_processedData/c_DonghyeonAlexmerged_Newclassifiers_ShortSpain.csv'
)
```
